Project Work

Data Preparation

Loading libraries

# Load all packages used in the analysis (order matters for masking;
# tidymodels_prefer() below resolves the remaining conflicts)
library(dplyr)      # data manipulation verbs
library(readr)      # reading delimited data
library(broom)      # tidying model output
library(ggplot2)    # plotting
library(tidymodels) 
library(readxl)     # reading Excel files
library(lubridate)  # date parsing (dmy)
library(ISLR)
library(vip)        # variable-importance plots
library(cluster)    # daisy() Gower dissimilarities for clustering
library(rpart.plot) # install.packages('rpart.plot')

tidymodels_prefer() # Resolves conflicts, prefers tidymodel functions

1. Regression

Research question: How do we predict the price of houses in Melbourne?

# Melbourne Housing Data

melb <- read_excel("melb_data.xlsx")

a. Data Exploration

i. Data Cleaning: Remove NA values

# Drop location/identifier columns we will not model on, parse the sale date,
# and add a log-transformed price (Price is right-skewed; see histogram below)
melbclean <- melb %>% select(-Longtitude, -Lattitude, -Address, -Postcode, -SellerG) %>%
  mutate(Date = dmy(Date), logPrice = log(Price))

# Count missing values per column before deciding how to handle them
sapply(melbclean, function(x) sum(is.na(x)))
##        Suburb         Rooms          Type         Price        Method 
##             0             0             0             0             0 
##          Date      Distance      Bedroom2      Bathroom           Car 
##             0             0             0             0            62 
##      Landsize  BuildingArea     YearBuilt   CouncilArea    Regionname 
##             0          6450          5375          1369             0 
## Propertycount      logPrice 
##             0             0
# Complete-case analysis: drop every row containing any NA
melbclean <- melbclean %>% na.omit()

head(melbclean)
## # A tibble: 6 × 17
##   Suburb   Rooms Type   Price Method Date       Distance Bedroom2 Bathroom   Car
##   <chr>    <dbl> <chr>  <dbl> <chr>  <date>        <dbl>    <dbl>    <dbl> <dbl>
## 1 Abbotsf…     2 h     1.03e6 S      2016-02-04      2.5        2        1     0
## 2 Abbotsf…     3 h     1.46e6 SP     2017-03-04      2.5        3        2     0
## 3 Abbotsf…     4 h     1.6 e6 VB     2016-06-04      2.5        3        1     2
## 4 Abbotsf…     3 h     1.88e6 S      2016-05-07      2.5        4        2     0
## 5 Abbotsf…     2 h     1.64e6 S      2016-10-08      2.5        2        1     2
## 6 Abbotsf…     2 h     1.10e6 S      2016-10-08      2.5        3        1     2
## # … with 7 more variables: Landsize <dbl>, BuildingArea <dbl>, YearBuilt <dbl>,
## #   CouncilArea <chr>, Regionname <chr>, Propertycount <dbl>, logPrice <dbl>

ii. Distribution of house prices

# Exploratory plots
# Right-skewed distribution

# Histogram of sale prices; the distribution is strongly right-skewed
ggplot(melbclean, aes(x = Price)) +
  geom_histogram(fill = "lightblue") +
  labs(
    x = "Price of houses in Melbourne (Australian dollars)",
    y = "Count"
  ) +
  theme_classic()

The distribution of house prices in Melbourne is right-skewed. The majority of the prices lie between 0 and 1500000.

It is also very interesting to see how distance from the Melbourne Central Business District affects the price.

iii. Exploratory plots between House Price and Distance from the Melbourne Central Business District

# Scatterplot of Price vs. distance from the CBD.
# Fix: this is a Melbourne housing dataset, so Distance measures km from the
# Melbourne Central Business District, not Sydney's.
ggplot(melbclean, aes(x = Distance, y = Price)) +
    geom_point(color = 'pink') +
    geom_smooth() +
    theme_classic() +
    labs(x = "Distance from Melbourne Central Business District (km)", y = "Price of houses in Melbourne (Australian dollars)")

# Remove extreme Price outliers (above 6M AUD) before modelling
melbclean <- melbclean %>% filter(Price < 6000000)

After removing the Price outliers:

# Scatterplot of Price vs. Distance after removing the Price outliers.
# Fix: Distance is from the Melbourne CBD, not Sydney's.
ggplot(melbclean, aes(x = Distance, y = Price)) +
    geom_point(color = 'pink') +
    geom_smooth() +
    theme_classic() +
    labs(x = "Distance from Melbourne Central Business District (km)", y = "Price of houses in Melbourne (Australian dollars)")

# Boxplot of YearBuilt by house type.
# Fix: removed geom_smooth() -- with a discrete x axis it draws nothing useful
# and only emits a warning. (The original comment also called this a
# scatterplot; it is a boxplot.)
ggplot(melbclean, aes(x = Type, y = YearBuilt)) +
    geom_boxplot(color = 'blue') +
    theme_classic() +
    labs(x = "Type of Houses", y = "The Year the House is built")

# Drop the implausible pre-1800 YearBuilt outlier
melbclean <- melbclean %>% filter(YearBuilt > 1800)

After removing the YearBuilt outlier:

# Boxplot of YearBuilt by house type after removing the outlier.
# Fix: removed the no-op geom_smooth() (discrete x axis).
ggplot(melbclean, aes(x = Type, y = YearBuilt)) +
    geom_boxplot(color = 'blue') +
    theme_classic() +
    labs(x = "Type of Houses", y = "The Year the House is built")

b. Data Pre-processing for Modelling

i. Create CV folds

melbclean_cv <- vfold_cv(melbclean, v = 10)

c. Simple linear regression - assuming linearity

i. Model Specification

# Ordinary least squares specification: regression mode with the lm engine
lm_spec <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine("lm")

ii. Recipe and Workflow

# Recipe: model logPrice on all remaining predictors
full_rec <- recipe(logPrice ~ ., data = melbclean) %>%
    step_rm(CouncilArea, Price, Regionname, Method, Suburb, Date) %>%
    step_nzv(all_predictors()) %>% # removes near-zero-variance (almost constant) predictors
    step_normalize(all_numeric_predictors()) %>% # important standardization step for LASSO
    step_dummy(all_nominal_predictors())  

# Bundle the recipe with the linear model specification
melb_wf <- workflow() %>%
  add_recipe(full_rec) %>%
  add_model(lm_spec)

iii. Fit and Tune Models

# Fit once on the full training data (used later for residual analysis)
mod_initial <- fit(melb_wf, data = melbclean)

# Estimate out-of-sample performance with 10-fold CV
mod1_cv <- fit_resamples(melb_wf,
  resamples = melbclean_cv, 
  metrics = metric_set(rmse, rsq, mae)
)
mod1_cv %>% collect_metrics()
## # A tibble: 3 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <fct>               
## 1 mae     standard   0.254    10 0.00287 Preprocessor1_Model1
## 2 rmse    standard   0.326    10 0.00623 Preprocessor1_Model1
## 3 rsq     standard   0.640    10 0.0106  Preprocessor1_Model1

d. LASSO - selecting variables that matter

i. Model specification, recipe, and workflow

set.seed(244)

# LASSO specification: mixture = 1 is pure lasso; the penalty is tuned by CV
lm_lasso_spec_tune <- 
  linear_reg() %>%
  set_args(mixture = 1, penalty = tune()) %>% 
  set_engine(engine = 'glmnet') %>%
  set_mode('regression') 

lasso_wf <- workflow() %>% 
  add_recipe(full_rec) %>%
  add_model(lm_lasso_spec_tune) 

# 30 candidate penalties, log-spaced from 10^-5 to 10^1
penalty_grid <- grid_regular(
  penalty(range = c(-5, 1)), 
  levels = 30)

lasso_fit_cv <- tune_grid( # new function for tuning hyperparameters
  lasso_wf, # workflow
  resamples = melbclean_cv, # folds
  metrics = metric_set(rmse, mae, rsq),
  grid = penalty_grid # penalty grid
)

ii. Selecting the best metric using the select_best method

# select_best() already returns a single row, so the arrange() is a no-op
# kept for symmetry with the 1-SE selections below
lasso_fit_cv %>% select_best(metric = 'rmse') %>% arrange(desc(penalty))
## # A tibble: 1 × 2
##   penalty .config              
##     <dbl> <fct>                
## 1 0.00001 Preprocessor1_Model01
lasso_fit_cv %>% select_best(metric = 'rsq') %>% arrange(desc(penalty))
## # A tibble: 1 × 2
##   penalty .config              
##     <dbl> <fct>                
## 1 0.00001 Preprocessor1_Model01

iii. Selecting the best metric using the 1 standard error method

# 1-SE rule: pick the largest penalty (simplest model) whose CV metric is
# within one standard error of the best
lasso_fit_cv %>% 
  select_by_one_std_err(metric = 'rmse', desc(penalty))
## # A tibble: 1 × 9
##   penalty .metric .estimator  mean     n std_err .config            .best .bound
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <fct>              <dbl>  <dbl>
## 1  0.0204 rmse    standard   0.330    10 0.00576 Preprocessor1_Mod… 0.326  0.333
lasso_fit_cv %>% 
  select_by_one_std_err(metric = 'rsq', desc(penalty))
## # A tibble: 1 × 9
##   penalty .metric .estimator  mean     n std_err .config            .best .bound
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <fct>              <dbl>  <dbl>
## 1  0.0329 rsq     standard   0.633    10  0.0103 Preprocessor1_Mod… 0.640  0.630

iv. Tuning the lasso model using the penalty selected by the 1 standard error method on RMSE

# Choose the penalty by the 1-SE rule on RMSE, then lock it into the workflow
best_penalty <- lasso_fit_cv %>% 
  select_by_one_std_err(metric = 'rmse', desc(penalty))

tuned_lasso_wf <- finalize_workflow(lasso_wf, best_penalty)

# extract_spec_parsnip() supersedes the deprecated pull_workflow_spec()
lm_lasso_spec <- tuned_lasso_wf %>% extract_spec_parsnip() # Save final tuned model spec

v. Plotting the penalty for mae, rmse, and rsq

lasso_fit_cv %>% autoplot() + theme_classic()

vi. List of variables by importance using the selected best penalty

# Fit the finalized (1-SE penalty) lasso on the full training data
lasso_fit <- tuned_lasso_wf %>% 
  fit(data = melbclean) 

# Coefficients by magnitude; zero estimates were shrunk out by the penalty
tidy(lasso_fit) %>% arrange(desc(abs(estimate)))
## # A tibble: 12 × 3
##    term          estimate penalty
##    <chr>            <dbl>   <dbl>
##  1 (Intercept)    13.8     0.0204
##  2 Type_u         -0.260   0.0204
##  3 Rooms           0.134   0.0204
##  4 YearBuilt      -0.129   0.0204
##  5 Distance       -0.125   0.0204
##  6 Bathroom        0.110   0.0204
##  7 BuildingArea    0.102   0.0204
##  8 Car             0.0190  0.0204
##  9 Bedroom2        0       0.0204
## 10 Landsize        0       0.0204
## 11 Propertycount   0       0.0204
## 12 Type_t          0       0.0204

vii. Examining output: plot of coefficient paths

glmnet_output <- lasso_fit %>% extract_fit_parsnip() %>% pluck('fit') # way to get the original glmnet output

# Reshape the glmnet coefficient matrix into long form: one row per
# (lambda, term); `var` collapses dummy columns (e.g. Type_u) back to the
# source variable name by taking the text before the first underscore
lambdas <- glmnet_output$lambda
coefs_lambdas <- 
  coefficients(glmnet_output, s = lambdas )  %>% 
  as.matrix() %>%  
  t() %>% 
  as.data.frame() %>% 
  mutate(lambda = lambdas ) %>% 
  select(lambda, everything(), -`(Intercept)`) %>% 
  pivot_longer(cols = -lambda, 
               names_to = "term", 
               values_to = "coef") %>%
  mutate(var = map_chr(stringr::str_split(term,"_"),~.[1]))

# Coefficient paths, with the chosen penalty marked by the dashed line
coefs_lambdas %>%
  ggplot(aes(x = lambda, y = coef, group = term, color = var)) +
  geom_line() +
  geom_vline(xintercept = best_penalty %>% pull(penalty), linetype = 'dashed') + 
  theme_classic() + 
  theme(legend.position = "bottom", legend.text=element_text(size=8))

glmnet_output_1 <- lasso_fit %>% extract_fit_engine()
    
# Create a boolean matrix (predictors x lambdas) of variable exclusion
bool_predictor_exclude <- glmnet_output_1$beta==0

# Loop over each variable. A variable's importance is the number of lambda
# values (from the point it first enters the model onward) for which its
# coefficient is nonzero; 0 means it never entered.
var_imp <- sapply(seq_len(nrow(bool_predictor_exclude)), function(row) {
    this_coeff_path <- bool_predictor_exclude[row,]
    if(sum(this_coeff_path) == ncol(bool_predictor_exclude)){ return(0)}else{
    return(ncol(bool_predictor_exclude) - which.min(this_coeff_path) + 1)}
})

# Create a dataset of this information and sort
var_imp_data <- tibble(
    var_name = rownames(bool_predictor_exclude),
    var_imp = var_imp
)
var_imp_data %>% arrange(desc(var_imp))
## # A tibble: 11 × 2
##    var_name      var_imp
##    <chr>           <dbl>
##  1 Rooms              61
##  2 BuildingArea       58
##  3 Type_u             58
##  4 Bathroom           55
##  5 YearBuilt          55
##  6 Distance           51
##  7 Car                39
##  8 Type_t             30
##  9 Landsize           28
## 10 Bedroom2           26
## 11 Propertycount      25

e. Residual plots - checking residual pattern to decide whether to update the model

# Residual analysis for the initial linear model (resid on the log scale)
mod1_output <- mod_initial%>% 
    predict(new_data = melbclean) %>% #this function maintains the row order of the new_data
    bind_cols(melbclean) %>%
    mutate(resid = logPrice - .pred)

# Inspect the single extreme negative residual
mod1_output %>% filter(resid < -2.5)
## # A tibble: 1 × 19
##   .pred Suburb   Rooms Type   Price Method Date       Distance Bedroom2 Bathroom
##   <dbl> <chr>    <dbl> <chr>  <dbl> <chr>  <date>        <dbl>    <dbl>    <dbl>
## 1  18.0 Camberw…     5 h     2.61e6 S      2016-10-15      7.8        5        2
## # … with 9 more variables: Car <dbl>, Landsize <dbl>, BuildingArea <dbl>,
## #   YearBuilt <dbl>, CouncilArea <chr>, Regionname <chr>, Propertycount <dbl>,
## #   logPrice <dbl>, resid <dbl>
# Flag the outlying prediction (fitted value above 17.8) for later inspection
mod1_output <- mod1_output %>%
  mutate(colors = if_else(.pred > 17.8,1,0))

# Quick plot: Residuals vs. Fitted values
ggplot(mod1_output, aes(x = .pred, y = resid)) +
    geom_point() +
    geom_smooth() +
    geom_hline(yintercept = 0, color = "red") +
    theme_classic() +
    labs(x = "Fitted values", y = "Residuals")

# Residuals vs. Distance predictor
ggplot(mod1_output, aes(x= Distance, y = resid)) +
    geom_point() +
    geom_smooth() +
    geom_hline(yintercept = 0, color = "red") +
    theme_classic() +
    labs(x = "Distance", y = "Residuals")

# Residuals vs. YearBuilt predictor
ggplot(mod1_output, aes(x= YearBuilt, y = resid)) +
    geom_point() +
    geom_smooth() +
    geom_hline(yintercept = 0, color = "red") +
    theme_classic() +
    labs(x = "Year Built", y = "Residuals")

# Confirm the flagged observation is the same extreme case found above
mod1_output %>% filter(.pred > 17)
## # A tibble: 1 × 20
##   .pred Suburb   Rooms Type   Price Method Date       Distance Bedroom2 Bathroom
##   <dbl> <chr>    <dbl> <chr>  <dbl> <chr>  <date>        <dbl>    <dbl>    <dbl>
## 1  18.0 Camberw…     5 h     2.61e6 S      2016-10-15      7.8        5        2
## # … with 10 more variables: Car <dbl>, Landsize <dbl>, BuildingArea <dbl>,
## #   YearBuilt <dbl>, CouncilArea <chr>, Regionname <chr>, Propertycount <dbl>,
## #   logPrice <dbl>, resid <dbl>, colors <dbl>

f. Updating the model for non-linearity

Method 1: Spline

set.seed(123)

# Recipe adding natural splines (5 df) on Distance and Bathroom to capture the
# nonlinearity seen in the residual plots.
# NOTE(review): Bathroom takes few distinct values, so 5 spline df appears to
# exceed what the data supports -- the fitted model below reports NA
# coefficients for Bathroom_ns_4/5 (rank deficiency); a smaller deg_free may
# be preferable. Confirm before relying on the Bathroom spline terms.
full_rec <- recipe(logPrice ~ ., data = melbclean) %>%
    step_rm(CouncilArea,Price,Regionname,Method,Suburb,Date) %>%
    step_nzv(all_predictors()) %>% # removes variables with near-zero variance
    step_normalize(all_numeric_predictors()) %>% # important standardization step for LASSO
    step_dummy(all_nominal_predictors())  %>%
    step_ns(Distance, deg_free = 5) %>%
    step_ns(Bathroom, deg_free = 5)


# Workflow (Recipe + Model): reuses lm_spec with the spline recipe
spline_wf <- workflow() %>% 
  add_recipe(full_rec) %>%
  add_model(lm_spec) 


# CV to Evaluate (same folds as the linear model, so metrics are comparable)
cv_output <- fit_resamples(
 spline_wf,
 resamples = melbclean_cv, # cv folds
 metrics = metric_set(mae,rsq,rmse)
)

cv_output %>% collect_metrics()
## # A tibble: 3 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <fct>               
## 1 mae     standard   0.253    10 0.00294 Preprocessor1_Model1
## 2 rmse    standard   0.325    10 0.00625 Preprocessor1_Model1
## 3 rsq     standard   0.645    10 0.00988 Preprocessor1_Model1
# Fit model on the full training data
ns_mod <- spline_wf %>%
  fit(data = melbclean) 

# The NA rows for Bathroom_ns_4/5 below indicate a rank-deficient design:
# Bathroom has too few distinct values for 5 spline degrees of freedom
ns_mod %>%
  tidy()
## # A tibble: 20 × 5
##    term          estimate std.error statistic    p.value
##    <chr>            <dbl>     <dbl>     <dbl>      <dbl>
##  1 (Intercept)    14.1      0.158      89.3    0        
##  2 Rooms           0.110    0.0141      7.80   7.08e- 15
##  3 Bedroom2        0.0199   0.0138      1.44   1.50e-  1
##  4 Car             0.0386   0.00473     8.15   4.27e- 16
##  5 Landsize        0.0118   0.00414     2.85   4.35e-  3
##  6 BuildingArea    0.107    0.00539    19.8    1.13e- 84
##  7 YearBuilt      -0.135    0.00523   -25.9    1.78e-140
##  8 Propertycount  -0.0110   0.00416    -2.64   8.38e-  3
##  9 Type_t         -0.0729   0.0160     -4.55   5.51e-  6
## 10 Type_u         -0.302    0.0133    -22.7    2.04e-109
## 11 Distance_ns_1  -0.271    0.0344     -7.86   4.55e- 15
## 12 Distance_ns_2  -0.271    0.0407     -6.65   3.21e- 11
## 13 Distance_ns_3  -0.833    0.0415    -20.0    1.23e- 86
## 14 Distance_ns_4  -1.01     0.0909    -11.1    3.61e- 28
## 15 Distance_ns_5  -0.894    0.0751    -11.9    2.84e- 32
## 16 Bathroom_ns_1   0.111    0.640       0.174  8.62e-  1
## 17 Bathroom_ns_2  -0.120    0.0992     -1.21   2.26e-  1
## 18 Bathroom_ns_3   0.614    0.224       2.74   6.15e-  3
## 19 Bathroom_ns_4  NA       NA          NA     NA        
## 20 Bathroom_ns_5  NA       NA          NA     NA
# Residuals for the spline model; bind_cols(melbclean, preds) keeps .pred as
# the last column, matching the original construction
spline_preds <- predict(ns_mod, new_data = melbclean)
spline_mod_output <- bind_cols(melbclean, spline_preds) %>%
  mutate(resid = logPrice - .pred)

# Residuals vs. observed logPrice
ggplot(spline_mod_output, aes(x = logPrice, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()

# Rank spline-model terms by absolute coefficient size
ns_mod %>%
  tidy() %>% 
  mutate(absolute_estimate_spline = abs(estimate)) %>% 
  arrange(desc(absolute_estimate_spline))
## # A tibble: 20 × 6
##    term          estimate std.error statistic    p.value absolute_estimate_spli…
##    <chr>            <dbl>     <dbl>     <dbl>      <dbl>                   <dbl>
##  1 (Intercept)    14.1      0.158      89.3    0                         14.1   
##  2 Distance_ns_4  -1.01     0.0909    -11.1    3.61e- 28                  1.01  
##  3 Distance_ns_5  -0.894    0.0751    -11.9    2.84e- 32                  0.894 
##  4 Distance_ns_3  -0.833    0.0415    -20.0    1.23e- 86                  0.833 
##  5 Bathroom_ns_3   0.614    0.224       2.74   6.15e-  3                  0.614 
##  6 Type_u         -0.302    0.0133    -22.7    2.04e-109                  0.302 
##  7 Distance_ns_2  -0.271    0.0407     -6.65   3.21e- 11                  0.271 
##  8 Distance_ns_1  -0.271    0.0344     -7.86   4.55e- 15                  0.271 
##  9 YearBuilt      -0.135    0.00523   -25.9    1.78e-140                  0.135 
## 10 Bathroom_ns_2  -0.120    0.0992     -1.21   2.26e-  1                  0.120 
## 11 Bathroom_ns_1   0.111    0.640       0.174  8.62e-  1                  0.111 
## 12 Rooms           0.110    0.0141      7.80   7.08e- 15                  0.110 
## 13 BuildingArea    0.107    0.00539    19.8    1.13e- 84                  0.107 
## 14 Type_t         -0.0729   0.0160     -4.55   5.51e-  6                  0.0729
## 15 Car             0.0386   0.00473     8.15   4.27e- 16                  0.0386
## 16 Bedroom2        0.0199   0.0138      1.44   1.50e-  1                  0.0199
## 17 Landsize        0.0118   0.00414     2.85   4.35e-  3                  0.0118
## 18 Propertycount  -0.0110   0.00416    -2.64   8.38e-  3                  0.0110
## 19 Bathroom_ns_4  NA       NA          NA     NA                         NA     
## 20 Bathroom_ns_5  NA       NA          NA     NA

Method 2: GAM

set.seed(123)
# Generalized Additive Regression (GAM) Model
gam_spec <- 
  gen_additive_mod() %>%
  set_engine(engine = 'mgcv') %>%
  set_mode('regression') 
# NOTE(review): this GAM models raw Price, while the earlier models used
# logPrice -- confirm the change of response scale is intentional
fit_gam_model <- gam_spec %>% # can't use a recipe with gam (yet)
  fit(Price ~ Type + s(Distance, k = 5) + s(Rooms, k = 5) + s(YearBuilt, k = 5) + s(Bathroom, k = 5), data = melbclean) # s() stands for splines  
fit_gam_model %>% pluck('fit') %>% summary() # estimates generalized CV error to choose smoothness, if not specified
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Price ~ Type + s(Distance, k = 5) + s(Rooms, k = 5) + s(YearBuilt, 
##     k = 5) + s(Bathroom, k = 5)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1131467       7961 142.120  < 2e-16 ***
## Typet        -166749      22052  -7.562 4.56e-14 ***
## Typeu        -212177      18970 -11.185  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F p-value    
## s(Distance)  3.559  3.884 146.4  <2e-16 ***
## s(Rooms)     3.769  3.961 131.5  <2e-16 ***
## s(YearBuilt) 3.606  3.898 120.5  <2e-16 ***
## s(Bathroom)  3.772  3.959 236.1  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.545   Deviance explained = 54.6%
## GCV = 1.9284e+11  Scale est. = 1.9229e+11  n = 6190

Based on the output, it looks like Type t (townhouse) and Type u (unit, duplex) have on average a price that is lower by 167K and 212K respectively compared to Type h (house, cottage, villa, semi, terrace), keeping all other factors fixed.

Based on the smooth terms, Distance, Rooms, YearBuilt, Bathroom seem to be strong predictors after considering other variables in the model. They seem to be wiggly curves with edf ranging from 3 to 4.

# Diagnostics: Check to see if the number of knots is large enough (if p-value is low, increase number of knots)
par(mfrow=c(2,2)) # 2x2 panel of residual diagnostic plots
fit_gam_model %>% pluck('fit') %>% mgcv::gam.check() 

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 12 iterations.
## The RMS GCV score gradient at convergence was 144146.1 .
## The Hessian was positive definite.
## Model rank =  19 / 19 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                k'  edf k-index p-value    
## s(Distance)  4.00 3.56    0.82  <2e-16 ***
## s(Rooms)     4.00 3.77    0.99    0.15    
## s(YearBuilt) 4.00 3.61    1.01    0.67    
## s(Bathroom)  4.00 3.77    0.96    0.01 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The residual plots look fine. There are no extreme patterns in the Q-Q plot (they are close to the line, which suggests the residuals are approximately Normal). There is no obvious pattern in the residual vs. linear pred. plot. The histogram of the residuals is unimodal and right-skewed. Lastly, the response v. fitted values are positively correlated as they should be.

For the number of knots, the edf (effective degrees of freedom) sits somewhat below k' = k - 1 for each smooth term. However, the k-index is below 1 with very small p-values for s(Distance) and s(Bathroom), which suggests the basis dimension k may be too low for those two terms; increasing k there would be worth exploring, while s(Rooms) and s(YearBuilt) appear to have enough knots.

fit_gam_model %>% pluck('fit') %>% plot(pages = 1)

The relationships between Melbourne house price and the predictors look to be at least slightly nonlinear for the chosen predictors. This suggests that forcing linear relationships would not have been ideal.

2. Classification

Research question: How do we predict which type of house it is (house, unit, townhouse) based on the existing predictors?

a. Data Exploration

i. Distribution of type of houses in terms of prices and building area

# Exploratory plot: log price vs. building area, colored by house type
ggplot(melbclean, aes(x = BuildingArea, y = logPrice, color = Type)) +
  geom_point() +
  labs(color = 'Type of Houses') +
  theme_classic()

ii. Data Cleaning: Removing outliers and change Type to factor

# Remove outliers: Building Area that are greater than 1500

melbclean_classification <- melbclean %>% filter(BuildingArea < 1500)

# Change Type to factor
melbclean_classification <- melbclean_classification %>%
  mutate(Type = factor(Type)) %>% #make sure outcome is factor
  mutate(across(where(is.character), as.factor)) #change all character variables to factors

# Confirm the three outcome levels (h = house, t = townhouse, u = unit)
unique(melbclean_classification$Type)
## [1] h u t
## Levels: h t u

iii. More exploratory plots

# Same exploratory plot, now on the cleaned classification data
ggplot(melbclean_classification, aes(x = BuildingArea, y = logPrice, color = Type)) +
  geom_point() +
  labs(color = 'Type of Houses') +
  theme_classic()

# Boxplot of log price by house type
ggplot(melbclean_classification, aes(x = Type, y = logPrice)) +
  geom_boxplot() +
  scale_fill_viridis_d() +
  labs(x = "House Type", y = "Log of House Price") +
  theme_classic() +
  theme(text = element_text(size = 20))

# Boxplot of distance from the CBD by house type.
# Fixes: the original chained two labs() calls (the second silently overrode
# the first), and the distance is from the Melbourne CBD, not Sydney's.
ggplot(melbclean_classification, aes(x = Type, y = Distance)) +
  geom_boxplot() +
    scale_fill_viridis_d() +
    labs(x = 'House Type', y = 'Distance from Melbourne Central Business District (km)') +
    theme_classic() +
    theme(text = element_text(size = 20)) 

# Boxplot of land size by house type
ggplot(melbclean_classification, aes(x = Type, y = Landsize)) +
  geom_boxplot() +
  scale_fill_viridis_d() +
  labs(x = "House Type", y = "Landsize") +
  theme_classic() +
  theme(text = element_text(size = 20))

iv. No Information Rate

# Class balance: houses (h) dominate, so the no-information rate is high
melbclean_classification %>% group_by(Type) %>% summarize(count = n())
## # A tibble: 3 × 2
##   Type  count
##   <fct> <int>
## 1 h      4081
## 2 t       602
## 3 u      1505

b. Modelling: Decision Tree

i. Model Specification

# Classification-tree specification. NULL keeps each rpart default:
# cost_complexity = 0.01 (pruning), min_n = 20 (minimum observations to try a
# split; the docs' "2" looks like a typo), tree_depth = 30 (maximum depth).
ct_spec <- decision_tree() %>%
  set_args(
    cost_complexity = NULL, # pruning penalty
    min_n = NULL,           # early-stopping: min observations per split
    tree_depth = NULL       # early-stopping: max depth
  ) %>%
  set_mode('classification') %>% # change this for regression tree
  set_engine(engine = 'rpart')

ii. Recipe and Workflow

# Recipe: step_novel() assigns unseen factor levels at predict time to a new
# level; nominal predictors are then dummy-encoded
type_predict <- recipe(Type ~ ., data = melbclean_classification) %>%
  #step_unknown(all_nominal_predictors()) %>%
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors())

# Bundle the tree spec with the recipe
type_wf <- workflow() %>%
  add_model(ct_spec) %>%
  add_recipe(type_predict)

iii. Model Building

# Fit the classification tree on the full classification data
fit_type <- type_wf %>%
  fit(data = melbclean_classification)

# Default tuning parameters (min_n = 20, depth = 30, cost_complexity = 0.01)
fit_type %>%
  extract_fit_engine() %>%
  rpart.plot()

iv. Visualizing metrics

# Training-set predictions from the fitted tree, labeled for later comparison
type_output <-  bind_rows(
  predict(fit_type, new_data = melbclean_classification) %>% bind_cols(melbclean_classification %>% select(Type)) %>% mutate(model = 'orig'))

# Macro-averaged sensitivity/specificity plus overall accuracy
ct_metrics <- metric_set(sens, yardstick::spec, accuracy)

# No Information Rate (accuracy if you classified everyone using largest outcome group): Proportion of largest outcome group
melbclean_classification %>%
  count(Type) %>%
  mutate(prop = n/sum(n)) %>%
  filter(prop == max(prop))
## # A tibble: 1 × 3
##   Type      n  prop
##   <fct> <int> <dbl>
## 1 h      4081 0.660
# Training Data Metrics
metrics <- type_output %>% 
  group_by(model) %>%
  ct_metrics(estimate = .pred_class, truth = Type) 


metrics %>% filter(.metric == 'accuracy') %>% arrange(desc(.estimate))
## # A tibble: 1 × 4
##   model .metric  .estimator .estimate
##   <chr> <chr>    <chr>          <dbl>
## 1 orig  accuracy multiclass     0.848
metrics %>% filter(.metric == 'sens') %>% arrange(desc(.estimate))
## # A tibble: 1 × 4
##   model .metric .estimator .estimate
##   <chr> <chr>   <chr>          <dbl>
## 1 orig  sens    macro          0.729
metrics %>% filter(.metric == 'spec') %>% arrange(desc(.estimate))
## # A tibble: 1 × 4
##   model .metric .estimator .estimate
##   <chr> <chr>   <chr>          <dbl>
## 1 orig  spec    macro          0.906
# Plot each metric (only one model so far)
metrics %>%
  ggplot(aes(x = .metric, y = .estimate, color = model)) +
  geom_point(size = 2) +
  geom_line(aes(group = model)) +
  theme_classic()

c. Modelling: Random Forest

i. Model Specification

set.seed(123)
# Random forest: 1000 trees; mtry = NULL falls back to the ranger default;
# probability = FALSE returns hard class predictions; impurity importance
# is recorded for the vip() plots below
rf_spec <- rand_forest() %>%
  set_engine(engine = 'ranger') %>% 
  set_args(mtry = NULL, 
           trees = 1000, 
           min_n = 2,
           probability = FALSE, 
           importance = 'impurity') %>% 
  set_mode('classification') 

ii. Recipe and Workflow

# Recipe: predict Type from everything except raw Price (logPrice is retained)
data_rec <- recipe(Type ~ ., data = melbclean_classification) %>%
  step_rm(Price)

# Workflow bundling the random-forest spec with the recipe
data_wf <- workflow() %>%
  add_recipe(data_rec) %>%
  add_model(rf_spec)

iii. Model Building

a. Evaluating OOB metrics
data_fit <- fit(data_wf, data = melbclean_classification)

# Build a tibble of ranger's out-of-bag class predictions alongside the truth.
#   fit_model: fitted workflow with a ranger engine
#   model_label: identifier stored in the `model` column
#   truth: observed classes, same row order as the training data
rf_OOB_output <- function(fit_model, model_label, truth){
    tibble(
          .pred_class = fit_model %>% extract_fit_engine() %>% pluck('predictions'), #OOB predictions
          class = truth,
          model = model_label
      )
}

data_rf_OOB_output <- bind_rows(
    rf_OOB_output(data_fit,2, melbclean_classification %>% pull(Type))
)
b. Fitting data
# Printing the fitted workflow shows ranger's OOB prediction error (11.88%)
data_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_rm()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~1000,      min.node.size = min_rows(~2, x), probability = ~FALSE, importance = ~"impurity",      num.threads = 1, verbose = FALSE, seed = sample.int(10^5,          1)) 
## 
## Type:                             Classification 
## Number of trees:                  1000 
## Sample size:                      6188 
## Number of independent variables:  15 
## Mtry:                             3 
## Target node size:                 2 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error:             11.88 %
c. Assessing variable importance
# Extract the raw ranger object to access impurity-based importances
model_output <-
  data_fit %>% 
    extract_fit_engine() 

model_output %>% 
    vip(num_features = 10) + theme_classic() #based on impurity

# Most and least important predictors by impurity decrease
model_output %>% vip::vi() %>% head()
## # A tibble: 6 × 2
##   Variable     Importance
##   <chr>             <dbl>
## 1 Landsize           586.
## 2 BuildingArea       430.
## 3 YearBuilt          397.
## 4 logPrice           355.
## 5 Rooms              245.
## 6 Bedroom2           218.
model_output %>% vip::vi() %>% tail()
## # A tibble: 6 × 2
##   Variable      Importance
##   <chr>              <dbl>
## 1 Propertycount      102. 
## 2 CouncilArea         87.7
## 3 Car                 78.3
## 4 Regionname          51.1
## 5 Bathroom            49.1
## 6 Method              39.0

3. Clustering

Research question: How many different clusters/types of houses are there in Melbourne based on the existing predictors?

a. Data Exploration

i. Distribution of Melbourne houses based on Distance and Building Area

# Scatterplot of building area vs. distance from the CBD.
# Fixes: the original had the x/y axis labels swapped relative to aes(), and
# the distance is from the Melbourne CBD, not Sydney's.
ggplot(melbclean, aes(x = Distance, y = BuildingArea)) +
    geom_point(color = 'lightblue') +
    theme_classic() + 
    labs(x = 'Distance from Melbourne Central Business District (km)', y = 'Building Area')

b. Picking the number of clusters: The Elbow Method

# Change all character variables to factors
melbclean_clustering <- melbclean %>% 
  mutate(across(where(is.character), as.factor)) 

melbclean_clustering_kmeans <- melbclean_clustering %>%
  select(Rooms, YearBuilt, Distance, BuildingArea) # Select a subset of variables to perform k-means

# Data-specific function to cluster and calculate total within-cluster SS.
# Fix: the dissimilarity matrix is invariant across k, so compute it once
# instead of recomputing daisy() inside every call (it was O(n^2) per k).
house_diss <- daisy(melbclean_clustering_kmeans)

house_cluster_ss <- function(k){
    # Perform clustering on the precomputed dissimilarities.
    # NOTE(review): kmeans() on a daisy() dissimilarity matrix clusters rows
    # of pairwise dissimilarities rather than the raw feature vectors --
    # confirm this is the intended methodology.
    kclust <- kmeans(house_diss, centers = k)

    # Return the total within-cluster sum of squares
    return(kclust$tot.withinss)
}

# Elbow plot of total within-cluster SS for k = 1..15.
# NOTE(review): no set.seed() before this loop, so k-means initialisation
# (and hence the curve) varies between runs.
tibble(
    k = 1:15,
    tot_wc_ss = purrr::map_dbl(1:15, house_cluster_ss)
) %>% 
    ggplot(aes(x = k, y = tot_wc_ss)) +
    geom_point() + 
    labs(x = "Number of clusters", y = 'Total within-cluster sum of squares') + 
    theme_classic()

Test with 3 and 5.

c. Modelling: K-means Clustering

i. Create model with k = 3

# Run k-means for k = centers = 3
set.seed(253)
kclust_k3_scale <- kmeans(daisy(melbclean_clustering_kmeans), centers = 3)
melbclean_clustering <- melbclean_clustering %>%
    mutate(kclust_3_scale = factor(kclust_k3_scale$cluster))
# Visualize the cluster assignments with Distance and Building Area
# (label fix: Distance is measured from the Melbourne CBD, not Sydney's)
ggplot(melbclean_clustering, aes(x = Distance, y = BuildingArea, color = kclust_3_scale)) +
    geom_point() + 
    labs (x = 'Distance from Melbourne Central Business District (km)', y = 'Building Area') +
    theme_classic()

# Visualize the cluster assignments with Year Built and Building Area
ggplot(melbclean_clustering, aes(x = YearBuilt, y = BuildingArea, color = kclust_3_scale)) +
    geom_point() + 
    labs (x = 'Year Built', y = 'Building Area') +
    theme_classic()

# Visualize the cluster assignments with Type and Distance
ggplot(melbclean_clustering, aes(x = Type, y = Distance, color = kclust_3_scale)) +
    geom_boxplot() + 
    labs (x = 'Type of Houses', y = 'Distance from Melbourne Central Business District (km)') +
    theme_classic()

ii. Create model with k = 5

# Run k-means for k = centers = 5
set.seed(253)
kclust_k5_scale <- kmeans(daisy(melbclean_clustering_kmeans), centers = 5)
melbclean_clustering <- melbclean_clustering %>%
    mutate(kclust_5_scale = factor(kclust_k5_scale$cluster))
# Visualize the cluster assignments with Distance and Building Area
# (label fix: Distance is measured from the Melbourne CBD, not Sydney's)
ggplot(melbclean_clustering, aes(x = Distance, y = BuildingArea, color = kclust_5_scale)) +
    geom_point() + 
    labs (x = 'Distance from Melbourne Central Business District (km)', y = 'Building Area') +
    theme_classic()

# Visualize the cluster assignments with Year Built and Building Area
ggplot(melbclean_clustering, aes(x = YearBuilt, y = BuildingArea, color = kclust_5_scale)) +
    geom_point() + 
    labs (x = 'Year Built', y = 'Building Area') +
    theme_classic()

# Visualize the cluster assignments with Type and Distance
ggplot(melbclean_clustering, aes(x = Type, y = Distance, color = kclust_5_scale)) +
    geom_boxplot() + 
    labs (x = 'Type of Houses', y = 'Distance from Melbourne Central Business District (km)') +
    theme_classic()

d. Modelling - Hierarchical Clustering

set.seed(253)

# Sample 50 houses (a dendrogram over thousands of leaves is unreadable),
# then cluster Gower dissimilarities with complete linkage
melbclean_clustering_1 <- melbclean %>% 
  mutate(across(where(is.character), as.factor)) %>% # Change all character variables to factors
  select(-Date) # Remove the Date variable


house_sub <- melbclean_clustering_1 %>%
  sample_n(50)
# NOTE(review): daisy() already returns a dissimilarity object, so the outer
# dist() call may be redundant -- verify it doesn't alter the distances
dist_mat_scaled <- dist(daisy(house_sub))
house_cluster <- hclust(dist_mat_scaled, method = "complete")

# Dendrogram of the 50 sampled houses
plot(house_cluster)

melbclust <- house_sub %>%
  mutate(
    # hclust_area3 cuts the tree at HEIGHT h = 3 (not into 3 clusters);
    # it is created but not used in the plots below
    hclust_area3 = factor(cutree(house_cluster, h = 3)),
    # hclust4 cuts the tree into exactly 4 clusters
    hclust4 = cutree(house_cluster,k = 4))
# Distance from the CBD by 4-cluster assignment
ggplot(melbclust, aes(x= factor(hclust4), y = Distance))+
  geom_boxplot()+
  labs(x = "Cluster")+
  theme_classic()

# Land size by 4-cluster assignment
ggplot(melbclust, aes(x= factor(hclust4), y = Landsize))+
  geom_boxplot()+
  labs(x = "Cluster")+
  theme_classic()

# Composition of each cluster by number of rooms (stacked proportions)
ggplot(melbclust, aes(x = factor(hclust4), fill = factor(Rooms)))+
  geom_bar(position = "fill") +
    labs(x = "Cluster") + 
    theme_classic()

# Composition of each cluster by house type (stacked proportions)
ggplot(melbclust, aes(x = factor(hclust4), fill = factor(Type)))+
  geom_bar(position = "fill") +
    labs(x = "Cluster") + 
    theme_classic()